import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.decomposition import PCA
from sklearn import manifold
from sklearn.preprocessing import StandardScaler
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
df = pd.read_csv('Clenead/exploratory_data_analisis.csv')
df_outlier = pd.read_csv('Clenead/exploratory_data_analisis2.csv')
df
| customer_unique_id | number_order_item | review_score | average_payment_order | last_purchase_by_order | late_delivered_days | last_purchase_days | |
|---|---|---|---|---|---|---|---|
| 0 | 0000366f3b9a7992bf8c76cfdf3221e2 | 1 | 5.0 | 141.90 | 2018-05-10 10:56:27 | -5 | 111 |
| 1 | 0000b849f77a49e4a4ce2b2a4ca5be3f | 1 | 4.0 | 27.19 | 2018-05-07 11:11:27 | -5 | 114 |
| 2 | 0000f46a3911fa3c0805444483337064 | 1 | 3.0 | 86.22 | 2017-03-10 21:05:03 | -2 | 536 |
| 3 | 0000f6ccb0745a6a4b88665a16c9f078 | 1 | 4.0 | 43.62 | 2017-10-12 20:29:41 | -12 | 320 |
| 4 | 0004aac84e0df4da2b147fca70cf8255 | 1 | 5.0 | 196.89 | 2017-11-14 19:45:42 | -8 | 287 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 91455 | fffcf5a5ff07b0908bd4e2dbc735a684 | 2 | 5.0 | 1033.71 | 2017-06-08 21:00:36 | -27 | 446 |
| 91456 | fffea47cd6d3cc0a88bd621562a9d061 | 1 | 4.0 | 84.58 | 2017-12-10 20:07:56 | -3 | 261 |
| 91457 | ffff371b4d645b6ecea244b27531430a | 1 | 5.0 | 112.46 | 2017-02-07 15:49:16 | -30 | 567 |
| 91458 | ffff5962728ec6157033ef9805bacc48 | 1 | 5.0 | 133.69 | 2018-05-02 15:17:41 | -14 | 118 |
| 91459 | ffffd2657e2aad2907e67c3e9daecbeb | 1 | 5.0 | 71.56 | 2017-05-02 20:18:45 | -16 | 483 |
91460 rows × 7 columns
# Keep the row index and isolate the numeric columns used as model inputs.
index = df.index
X = df.select_dtypes(include=np.number)
features = X.columns
# Standardise the data with StandardScaler (fit, then transform the same matrix).
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)
X_scaled
array([[-0.28133371, 0.66078254, -0.0204294 , 0.68020428, -0.82723195],
[-0.28133371, -0.12096433, -0.60587711, 0.68020428, -0.8075944 ],
[-0.28133371, -0.9027112 , -0.30460452, 0.97651857, 1.95475301],
...,
[-0.28133371, 0.66078254, -0.17068292, -1.78908154, 2.15767426],
[-0.28133371, 0.66078254, -0.06233094, -0.20873862, -0.78141102],
[-0.28133371, 0.66078254, -0.37942506, -0.40628148, 1.60782312]])
X_scaled.shape
(91460, 5)
# Value handed to PCA as `n_components`. A float in (0, 1) is a variance
# threshold: PCA keeps as many components as needed to explain that share of
# the variance. The former value (4) was defined but never passed to PCA.
n_components = 0.90
pca = PCA(n_components=n_components)
pca.fit(X_scaled)
PCA(n_components=0.9)
pca.explained_variance_ratio_
array([0.25724762, 0.21409974, 0.20090603, 0.1885338 , 0.1392128 ])
scree = (pca.explained_variance_ratio_*100).round(2)
scree
array([25.72, 21.41, 20.09, 18.85, 13.92])
scree_cum = scree.cumsum().round()
scree_cum
array([ 26., 47., 67., 86., 100.])
# Axis ranks 1..k, derived from the fitted PCA so the scree plot and the
# "F<i>" labels follow the number of components actually retained instead of
# a hard-coded 5.
x_list = range(1, pca.n_components_ + 1)
list(x_list)
[1, 2, 3, 4, 5]
# Scree plot: percentage of inertia per principal axis (bars) and its
# cumulative sum (red line).
plt.bar(x_list, scree)
plt.plot(x_list, scree_cum, color="red", marker="o")
plt.gca().set(
    xlabel="rang de l'axe d'inertie",
    ylabel="pourcentage d'inertie",
    title="Eboulis des valeurs propres",
)
plt.show(block=False)
pca.explained_variance_
array([1.28625217, 1.07051042, 1.00454115, 0.94267931, 0.69607162])
pcs = pca.components_
pcs
array([[ 0.11881488, -0.67685151, -0.02674377, 0.69437992, -0.21184039],
[ 0.71760948, -0.18949429, -0.41292642, -0.17106079, 0.49935812],
[-0.03370996, -0.24431761, 0.77173819, -0.02398148, 0.58567705],
[ 0.64449682, 0.08309642, 0.47633506, -0.18281168, -0.56338586],
[ 0.2332534 , 0.66284827, 0.07938277, 0.67422512, 0.21294103]])
pcs = pd.DataFrame(pcs)
pcs
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| 0 | 0.118815 | -0.676852 | -0.026744 | 0.694380 | -0.211840 |
| 1 | 0.717609 | -0.189494 | -0.412926 | -0.171061 | 0.499358 |
| 2 | -0.033710 | -0.244318 | 0.771738 | -0.023981 | 0.585677 |
| 3 | 0.644497 | 0.083096 | 0.476335 | -0.182812 | -0.563386 |
| 4 | 0.233253 | 0.662848 | 0.079383 | 0.674225 | 0.212941 |
pcs.columns = features
pcs.index = [f"F{i}" for i in x_list]
pcs.round(2)
| number_order_item | review_score | average_payment_order | late_delivered_days | last_purchase_days | |
|---|---|---|---|---|---|
| F1 | 0.12 | -0.68 | -0.03 | 0.69 | -0.21 |
| F2 | 0.72 | -0.19 | -0.41 | -0.17 | 0.50 |
| F3 | -0.03 | -0.24 | 0.77 | -0.02 | 0.59 |
| F4 | 0.64 | 0.08 | 0.48 | -0.18 | -0.56 |
| F5 | 0.23 | 0.66 | 0.08 | 0.67 | 0.21 |
pcs.T
| F1 | F2 | F3 | F4 | F5 | |
|---|---|---|---|---|---|
| number_order_item | 0.118815 | 0.717609 | -0.033710 | 0.644497 | 0.233253 |
| review_score | -0.676852 | -0.189494 | -0.244318 | 0.083096 | 0.662848 |
| average_payment_order | -0.026744 | -0.412926 | 0.771738 | 0.476335 | 0.079383 |
| late_delivered_days | 0.694380 | -0.171061 | -0.023981 | -0.182812 | 0.674225 |
| last_purchase_days | -0.211840 | 0.499358 | 0.585677 | -0.563386 | 0.212941 |
# Annotated heatmap of the transposed loadings table, on a fixed [-1, 1]
# colour scale.
fig, ax = plt.subplots(figsize=(20, 6))
sns.heatmap(
    pcs.T,
    ax=ax,
    vmin=-1,
    vmax=1,
    annot=True,
    cmap="coolwarm",
    fmt="0.2f",
)
<AxesSubplot:>
def cercle_corelation(pca, x, y, feature_names=None):
    """Draw the correlation circle of a fitted PCA on the plane (F{x+1}, F{y+1}).

    Parameters
    ----------
    pca : object
        Fitted PCA exposing ``components_`` and ``explained_variance_ratio_``.
    x, y : int
        Zero-based indices of the components shown on the horizontal and
        vertical axes.
    feature_names : sequence of str, optional
        One label per input variable. When omitted, the module-level
        ``features`` is used (the original behaviour).
    """
    names = features if feature_names is None else feature_names

    fig, ax = plt.subplots(figsize=(10, 9))
    for i in range(pca.components_.shape[1]):
        dx = pca.components_[x, i]
        dy = pca.components_[y, i]
        # Arrow from the origin to the loading of variable i on the two axes.
        ax.arrow(0, 0, dx, dy,
                 head_width=0.07,
                 head_length=0.07,
                 width=0.02)
        # The label follows the arrow of the *selected* axes; it used to be
        # positioned with components 0 and 1 whatever x and y were.
        plt.text(dx + 0.05, dy + 0.05, names[i])

    # Horizontal and vertical reference lines.
    plt.plot([-1, 1], [0, 0], color='grey', ls='--')
    plt.plot([0, 0], [-1, 1], color='grey', ls='--')

    # Axis names with the percentage of explained inertia.
    plt.xlabel('F{} ({}%)'.format(x+1, round(100*pca.explained_variance_ratio_[x], 1)))
    plt.ylabel('F{} ({}%)'.format(y+1, round(100*pca.explained_variance_ratio_[y], 1)))
    plt.title("Cercle des corrélations (F{} et F{})".format(x+1, y+1))

    # Unit circle for scale.
    an = np.linspace(0, 2 * np.pi, 100)
    plt.plot(np.cos(an), np.sin(an))
    plt.axis('equal')
    plt.show(block=False)
def correlation_graph(pca,
                      x_y,
                      features):
    """Display the correlation circle for one factorial plane.

    Positional arguments :
    -----------------------------------
    pca : sklearn.decomposition.PCA : a PCA object that has already been fitted
    x_y : list or tuple : the (x, y) pair of component indices to plot, e.g. [0, 1] for F1, F2
    features : list or tuple : the feature (dimension) names, one per input variable
    """
    x, y = x_y
    ratios = pca.explained_variance_ratio_

    # Figure size is given in inches.
    fig, ax = plt.subplots(figsize=(8, 7))

    # One arrow and one label per input variable.
    loadings = zip(pca.components_[x], pca.components_[y])
    for i, (dx, dy) in enumerate(loadings):
        ax.arrow(0, 0, dx, dy,
                 head_width=0.07,
                 head_length=0.07,
                 width=0.02)
        ax.text(dx + 0.05, dy + 0.05, features[i])

    # Horizontal and vertical reference lines.
    ax.plot([-1, 1], [0, 0], color='grey', ls='--')
    ax.plot([0, 0], [-1, 1], color='grey', ls='--')

    # Axis names with the percentage of explained inertia.
    ax.set_xlabel(f'F{x+1} ({round(100*ratios[x], 1)}%)')
    ax.set_ylabel(f'F{y+1} ({round(100*ratios[y], 1)}%)')
    ax.set_title(f"Cercle des corrélations (F{x+1} et F{y+1})")

    # Unit circle for scale.
    angles = np.linspace(0, 2 * np.pi, 100)
    ax.plot(np.cos(angles), np.sin(angles))

    # Equal aspect ratio, then display.
    ax.axis('equal')
    plt.show(block=False)
# Coordinates of every observation on the principal axes.
X_proj = pca.transform(X_scaled)
print(X_proj.shape)
# Same coordinates as a DataFrame whose columns reuse the row labels of `pcs`.
pcs_pi = pd.DataFrame(X_proj, columns=pcs.index)
# Rows whose F1 coordinate is greater than 1.
pcs_pi.loc[pcs_pi['F1'] > 1]
(91460, 5)
| F1 | F2 | F3 | F4 | F5 | |
|---|---|---|---|---|---|
| 7 | 1.974028 | 0.768192 | 0.606183 | 0.701988 | -1.349849 |
| 9 | 4.674222 | -0.648644 | 0.184721 | -0.947726 | 1.053813 |
| 24 | 1.097449 | -0.436968 | -0.560411 | -0.420235 | 0.659344 |
| 30 | 1.313328 | 0.551239 | 0.955382 | -0.592607 | -1.795524 |
| 36 | 1.640504 | 0.323156 | -0.765799 | 1.132243 | -1.197903 |
| ... | ... | ... | ... | ... | ... |
| 91420 | 2.041700 | 0.990251 | 1.773347 | -1.644348 | -0.492920 |
| 91434 | 1.548224 | 0.206268 | 3.994929 | -0.547355 | -0.539620 |
| 91436 | 3.373446 | -0.593088 | -0.594117 | -0.197623 | -0.639665 |
| 91438 | 2.646996 | 0.115925 | 0.068528 | 0.741527 | 0.127113 |
| 91444 | 4.714721 | -0.704325 | 0.413924 | -0.919716 | 1.160219 |
13223 rows × 5 columns
pca.inverse_transform(X_proj).shape
(91460, 5)
def display_factorial_planes(X_projected,
                             x_y,
                             pca=None,
                             labels=None,
                             clusters=None,
                             alpha=1,
                             figsize=(8, 7),
                             marker="."):
    """
    Display the projection of the individuals on one factorial plane.

    Positional arguments :
    -------------------------------------
    X_projected : np.array, pd.DataFrame, list of list : matrix of projected points
    x_y : list or tuple : the (x, y) pair of component indices to plot, e.g. [0, 1] for F1, F2

    Optional arguments :
    -------------------------------------
    pca : sklearn.decomposition.PCA : a fitted PCA object, used to show the share of variance of each axis, default = None
    labels : list or tuple : labels of the individuals, written next to each point, default = None
    clusters : list or tuple : cluster of each individual, used to colour the points, default = None
    alpha : float in [0,1] : opacity of the points, 0 = fully transparent, 1 = opaque, default = 1
    figsize : list or tuple : (width, height) of the figure in inches, default = (8, 7); a falsy value falls back to (7, 6)
    marker : str : matplotlib marker used for the points, default = "."

    Raises :
    -------------------------------------
    AttributeError : if x_y does not hold exactly two indices, or if one of them is not a column of X_projected
    TypeError : if labels has no length
    """
    # Work on a numpy array whatever the input container was.
    X_ = np.array(X_projected)

    # Fallback size when the caller passes None / an empty value.
    if not figsize:
        figsize = (7, 6)

    # No labels means nothing is written next to the points.
    if labels is None:
        labels = []
    # Evaluated up front so an unsized `labels` fails before any figure exists.
    n_labels = len(labels)

    # Validate the requested axes.
    if len(x_y) != 2:
        raise AttributeError("2 axes sont demandées")
    if max(x_y) >= X_.shape[1]:
        raise AttributeError("la variable axis n'est pas bonne")
    x, y = x_y

    fig, ax = plt.subplots(1, 1, figsize=figsize)

    # `alpha` and `marker` are forwarded to the scatter plot; they were
    # documented but silently ignored before.
    sns.scatterplot(x=X_[:, x], y=X_[:, y], hue=clusters,
                    alpha=alpha, marker=marker, ax=ax)

    # With a fitted PCA, show the percentage of variance of each axis.
    if pca is not None:
        v1 = str(round(100*pca.explained_variance_ratio_[x])) + " %"
        v2 = str(round(100*pca.explained_variance_ratio_[y])) + " %"
    else:
        v1 = v2 = ''

    ax.set_xlabel(f'F{x+1} {v1}')
    ax.set_ylabel(f'F{y+1} {v2}')

    # Symmetric limits with a 10 % margin around the farthest point.
    x_max = np.abs(X_[:, x]).max() * 1.1
    y_max = np.abs(X_[:, y]).max() * 1.1
    ax.set_xlim(left=-x_max, right=x_max)
    ax.set_ylim(bottom=-y_max, top=y_max)

    # Horizontal and vertical axes through the origin.
    ax.plot([-x_max, x_max], [0, 0], color='grey', alpha=0.8)
    ax.plot([0, 0], [-y_max, y_max], color='grey', alpha=0.8)

    # Point labels, when provided (the per-label debug print was removed).
    if n_labels:
        for i, (_x, _y) in enumerate(X_[:, [x, y]]):
            ax.text(_x, _y+0.05, labels[i], fontsize='14',
                    horizontalalignment='center', verticalalignment='center')

    ax.set_title(f"Projection des individus (sur F{x+1} et F{y+1})")
    plt.show()
# Correlation circle on the first factorial plane (F1, F2).
x, y = 0, 1
correlation_graph(pca, (x, y), features)
# Projected coordinates as a DataFrame; the column names reuse the component
# labels carried by `pcs` rather than a hard-coded five-element list.
df_pca = pd.DataFrame(X_proj, columns=pcs.index)
# Two-dimensional t-SNE embedding of the standardised features.
# Later cells (KMeans and the yellowbrick visualisers fitted on `X_tsne`) read
# this variable, so it has to be computed here. `X_scaled` from the scaling
# step is reused instead of re-selecting and re-scaling the data.
# NOTE(review): this can take a while on the full dataset.
tsne = manifold.TSNE(n_components=2, perplexity=40, init='pca', learning_rate=200.0)
X_tsne = tsne.fit_transform(X_scaled)
X_tsne
from sklearn.cluster import KMeans

# KMeans inertia on the PCA coordinates for 1 to 5 clusters.
k = range(1, 6)
inertia = []
for i in k:
    model = KMeans(n_clusters=i).fit(X_proj)
    inertia.append(model.inertia_)
inertia
[457300.0000000003, 376392.85208271566, 319234.6828643871, 271072.79091752094, 230724.12159447552]
# Elbow curve: KMeans inertia against the number of clusters.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.set_ylabel("inertia")
ax.set_xlabel("n_cluster")
# Draw on the axes created above; `ax` used to be rebound to the list of
# Line2D objects returned by plt.plot.
ax.plot(k, inertia)
#df['cluster'] = model.labels_
model = KMeans(n_clusters=4)
model.fit(df_pca)
KMeans(n_clusters=4)
df_pca['cluster'] = model.labels_
df_pca
| F1 | F2 | F3 | F4 | F5 | cluster | |
|---|---|---|---|---|---|---|
| 0 | 0.167429 | -0.848108 | -0.668526 | 0.205560 | 0.653214 | 0 |
| 1 | 0.708053 | -0.448418 | -0.917843 | -0.149333 | 0.092742 | 0 |
| 2 | 0.849700 | 0.904028 | 1.116393 | -1.681224 | 0.386478 | 1 |
| 3 | -0.059939 | 0.308584 | -0.046796 | -0.742689 | -0.079622 | 1 |
| 4 | -0.289886 | -0.338014 | 0.229911 | -0.255644 | 0.721033 | 1 |
| ... | ... | ... | ... | ... | ... | ... |
| 91455 | -1.786990 | -0.410982 | 4.140574 | 2.298704 | 0.292643 | 3 |
| 91456 | 0.633548 | -0.122655 | -0.132975 | -0.588038 | 0.454082 | 0 |
| 91457 | -2.175498 | 1.126871 | 1.022926 | -1.096250 | -0.387959 | 1 |
| 91458 | -0.458421 | -0.655861 | -0.652709 | 0.322295 | 0.060297 | 0 |
| 91459 | -1.093247 | 0.701951 | 0.506634 | -1.138695 | 0.410703 | 1 |
91460 rows × 6 columns
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer

# Elbow analysis on the PCA coordinates over the k range given by (2, 8).
visualizer = KElbowVisualizer(model, k=(2, 8))
visualizer.fit(X_proj)  # fit the data to the visualizer
visualizer.show()  # `show()` replaces the deprecated `poof()`
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
# Silhouette plot for a 4-cluster KMeans on the PCA coordinates.
model = KMeans(4)
visualizer = SilhouetteVisualizer(model)
visualizer.fit(X_proj)  # fit the data to the visualizer
visualizer.show()  # `show()` replaces the deprecated `poof()`
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 91460 Samples in 4 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
#df['cluster'] = model.labels_
df_clust = df.select_dtypes(include=np.number)
model = KMeans(n_clusters=4)
model.fit(df_clust)
KMeans(n_clusters=4)
df_clust['cluster' ] = model.labels_
df_clust
| number_order_item | review_score | average_payment_order | late_delivered_days | last_purchase_days | cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | 5.0 | 141.90 | -5 | 111 | 1 |
| 1 | 1 | 4.0 | 27.19 | -5 | 114 | 1 |
| 2 | 1 | 3.0 | 86.22 | -2 | 536 | 0 |
| 3 | 1 | 4.0 | 43.62 | -12 | 320 | 0 |
| 4 | 1 | 5.0 | 196.89 | -8 | 287 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| 91455 | 2 | 5.0 | 1033.71 | -27 | 446 | 3 |
| 91456 | 1 | 4.0 | 84.58 | -3 | 261 | 0 |
| 91457 | 1 | 5.0 | 112.46 | -30 | 567 | 0 |
| 91458 | 1 | 5.0 | 133.69 | -14 | 118 | 1 |
| 91459 | 1 | 5.0 | 71.56 | -16 | 483 | 0 |
91460 rows × 6 columns
df_clust.cluster.value_counts()
1 49270 0 36647 3 4894 2 649 Name: cluster, dtype: int64
df_clust.loc[df_clust.cluster == 0]
| number_order_item | review_score | average_payment_order | late_delivered_days | last_purchase_days | cluster | |
|---|---|---|---|---|---|---|
| 2 | 1 | 3.0 | 86.22 | -2 | 536 | 0 |
| 3 | 1 | 4.0 | 43.62 | -12 | 320 | 0 |
| 4 | 1 | 5.0 | 196.89 | -8 | 287 | 0 |
| 8 | 1 | 4.0 | 150.12 | -28 | 542 | 0 |
| 10 | 1 | 3.0 | 29.00 | -12 | 407 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| 91450 | 1 | 4.0 | 55.00 | -7 | 361 | 0 |
| 91453 | 1 | 5.0 | 81.20 | -7 | 302 | 0 |
| 91456 | 1 | 4.0 | 84.58 | -3 | 261 | 0 |
| 91457 | 1 | 5.0 | 112.46 | -30 | 567 | 0 |
| 91459 | 1 | 5.0 | 71.56 | -16 | 483 | 0 |
36647 rows × 6 columns
import plotly.express as px

# One notched box plot per feature, split by KMeans cluster.
# The former `df = px.data.tips()` line (plotly tutorial residue) is gone: it
# overwrote the customer DataFrame `df` with an unrelated sample dataset.
box_plots = [
    ("number_order_item", "Cluster sur les commandes clients"),
    ("review_score", "Cluster sur les satisfactions clients"),
    ("average_payment_order", "Cluster sur les dépenses clients"),
    ("late_delivered_days", "Cluster sur les retards de livraisons clients"),
]
for column, title in box_plots:
    fig = px.box(
        df_clust,
        y=column,
        color="cluster",
        notched=True,  # notched box shape
        title=title,
        hover_data=["cluster"],  # show the cluster id on hover
    )
    fig.show()
df_clust.cluster.value_counts()
1 49270 0 36647 3 4894 2 649 Name: cluster, dtype: int64
df_clust[df_clust.cluster == 2]
| number_order_item | review_score | average_payment_order | late_delivered_days | last_purchase_days | cluster | |
|---|---|---|---|---|---|---|
| 128 | 1 | 5.0 | 1600.51 | -19 | 149 | 2 |
| 286 | 1 | 5.0 | 2304.68 | -14 | 33 | 2 |
| 411 | 1 | 5.0 | 4016.91 | -10 | 528 | 2 |
| 432 | 1 | 5.0 | 1429.59 | -8 | 166 | 2 |
| 564 | 1 | 5.0 | 1841.11 | 1 | 28 | 2 |
| ... | ... | ... | ... | ... | ... | ... |
| 91116 | 1 | 5.0 | 3048.27 | -20 | 449 | 2 |
| 91121 | 1 | 5.0 | 1672.67 | -12 | 515 | 2 |
| 91193 | 1 | 5.0 | 6726.66 | -23 | 461 | 2 |
| 91258 | 2 | 5.0 | 1568.72 | -16 | 41 | 2 |
| 91369 | 1 | 5.0 | 1626.83 | -23 | 547 | 2 |
649 rows × 6 columns
df_clust[df_clust.cluster == 2].average_payment_order.min(), df_clust[df_clust.cluster == 2].average_payment_order.max()
(1146.2, 6929.31)
# Min and max average payment of cluster 1 (the max was read from cluster 2 by copy-paste).
df_clust[df_clust.cluster == 1].average_payment_order.min(), df_clust[df_clust.cluster == 1].average_payment_order.max()
(9.59, 6929.31)
# Min and max average payment of cluster 0 (the max was read from cluster 2 by copy-paste).
df_clust[df_clust.cluster == 0].average_payment_order.min(), df_clust[df_clust.cluster == 0].average_payment_order.max()
(9.34142857142857, 6929.31)
# Re-cluster into 3 groups. `df_clust` still carries the `cluster` column
# written by the previous 4-cluster run, so it is dropped before fitting;
# otherwise the old labels are fed back in as a feature.
model = KMeans(n_clusters=3)
model.fit(df_clust.drop(columns='cluster', errors='ignore'))
KMeans(n_clusters=3)
df_clust.loc[:,'cluster' ] = model.labels_
df_clust
| number_order_item | review_score | average_payment_order | late_delivered_days | last_purchase_days | cluster | |
|---|---|---|---|---|---|---|
| 0 | 1 | 5.0 | 141.90 | -5 | 111 | 0 |
| 1 | 1 | 4.0 | 27.19 | -5 | 114 | 0 |
| 2 | 1 | 3.0 | 86.22 | -2 | 536 | 1 |
| 3 | 1 | 4.0 | 43.62 | -12 | 320 | 1 |
| 4 | 1 | 5.0 | 196.89 | -8 | 287 | 1 |
| ... | ... | ... | ... | ... | ... | ... |
| 91455 | 2 | 5.0 | 1033.71 | -27 | 446 | 2 |
| 91456 | 1 | 4.0 | 84.58 | -3 | 261 | 1 |
| 91457 | 1 | 5.0 | 112.46 | -30 | 567 | 1 |
| 91458 | 1 | 5.0 | 133.69 | -14 | 118 | 0 |
| 91459 | 1 | 5.0 | 71.56 | -16 | 483 | 1 |
91460 rows × 6 columns
df_clust.cluster.value_counts()
0 50916 1 37889 2 2655 Name: cluster, dtype: int64
# One notched box plot per feature, split by the 3-cluster labels.
# The repeated `df = px.data.tips()` lines (plotly tutorial residue) are gone:
# they overwrote `df` with an unrelated sample dataset and were never used.
box_plots = [
    ("number_order_item", "Cluster sur les commandes clients"),
    ("average_payment_order", "Cluster sur les dépenses clients"),
    ("late_delivered_days", "Cluster sur les retards de livraisons"),
    ("review_score", "Cluster sur les scores clients"),
]
for column, title in box_plots:
    fig = px.box(
        df_clust,
        y=column,
        color="cluster",
        notched=True,  # notched box shape
        title=title,
        hover_data=["cluster"],  # show the cluster id on hover
    )
    fig.show()
# Silhouette plot for a 2-cluster KMeans on the raw numeric features.
# The `cluster` column added by the earlier fits is excluded so previous
# labels are not used as an input feature.
model = KMeans(2)
visualizer = SilhouetteVisualizer(model)
visualizer.fit(df_clust.drop(columns='cluster', errors='ignore'))
visualizer.show()  # `show()` replaces the deprecated `poof()`
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 91460 Samples in 2 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn import metrics

# Inertia and silhouette score of KMeans on the PCA coordinates for k = 2, 3, 4.
inertie_lst = []
silhouette_lst = []
for k in range(2, 5):
    print('k =', k)
    model = KMeans(n_clusters=k)
    model.fit(X_proj)
    inertie_lst.append(model.inertia_)
    # The silhouette is scored on the data the labels were fitted on
    # (`X_proj`); it used to be scored against `X_tsne`, a different space.
    silhouette_lst.append(metrics.silhouette_score(X_proj, model.labels_))
    centers = model.cluster_centers_
k = 2 k = 3 k = 4
silhouette_lst
[0.111704454, 0.124344744, 0.113509364]
# Silhouette score against the number of clusters.
fig, ax = plt.subplots(1)
# silhouette_lst[i] belongs to k = 2 + i (the scoring loop starts at k = 2),
# so plot against those k values rather than the list positions 0, 1, 2.
k_values = np.arange(2, 2 + len(silhouette_lst))
ax.set_xticks(k_values)
ax.plot(k_values, silhouette_lst)
ax.set_xlabel('k', fontsize=20)
ax.set_ylabel('Silhouette', fontsize=20)
fig.set_figheight(8)
fig.set_figwidth(8)
plt.grid()
from sklearn.cluster import KMeans
model = KMeans(n_clusters=5)
model.fit(X_tsne)
cluster = model.predict(X_tsne)
cluster
array([4, 4, 3, ..., 0, 0, 1])
# Elbow analysis on the t-SNE embedding over the k range given by (2, 8).
visualizer = KElbowVisualizer(model, k=(2, 8))
visualizer.fit(X_tsne)  # fit the data to the visualizer
visualizer.show()  # `show()` replaces the deprecated `poof()`
<AxesSubplot:title={'center':'Distortion Score Elbow for KMeans Clustering'}, xlabel='k', ylabel='distortion score'>
# Silhouette plot for a 4-cluster KMeans on the t-SNE embedding.
model = KMeans(4)
visualizer = SilhouetteVisualizer(model)
visualizer.fit(X_tsne)  # fit the data to the visualizer
visualizer.show()  # `show()` replaces the deprecated `poof()`
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 91460 Samples in 4 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
data_x_tnse = pd.DataFrame(X_tsne)
data_x_tnse['cluster'] = cluster
data_x_tnse
| 0 | 1 | cluster | |
|---|---|---|---|
| 0 | -1.135180 | 5.481637 | 4 |
| 1 | -29.377144 | 44.306736 | 4 |
| 2 | -11.386347 | 41.802914 | 3 |
| 3 | -25.837961 | 13.150128 | 4 |
| 4 | 35.942589 | -11.248713 | 2 |
| ... | ... | ... | ... |
| 91455 | 47.699753 | 13.797524 | 2 |
| 91456 | -20.651836 | 41.465958 | 4 |
| 91457 | 9.849265 | -47.742378 | 0 |
| 91458 | 7.628273 | -29.965574 | 0 |
| 91459 | -25.084793 | -33.355484 | 1 |
91460 rows × 3 columns
data_x_tnse.columns
Index([0, 1, 'cluster'], dtype='object')
sns.scatterplot(data = data_x_tnse, x = 0, y= 1,hue = 'cluster')
<AxesSubplot:xlabel='0', ylabel='1'>
from sklearn.cluster import KMeans

# KMeans inertia for 1 to 4 clusters on the numeric columns of `df_outlier`.
df_clust2 = df_outlier.select_dtypes(include=np.number)
k = range(1, 5)
inertia = []
for i in k:
    model = KMeans(n_clusters=i).fit(df_clust2)
    inertia.append(model.inertia_)
inertia
[2246741301.155181, 921199418.7926846, 611620628.081596, 492114342.4343247]
# Elbow curve: KMeans inertia against the number of clusters.
fig, ax = plt.subplots(1, 1, figsize=(12, 6))
ax.set_ylabel("inertia")
ax.set_xlabel("n_cluster")
# Draw on the axes created above; `ax` used to be rebound to the list of
# Line2D objects returned by plt.plot.
ax.plot(k, inertia)
#df['cluster'] = model.labels_
df_clust2 = df_outlier.select_dtypes(include=np.number)
df_clust2 = df_clust2.loc[:,['number_order_item','average_payment_order','last_purchase_days']]
model = KMeans(n_clusters=3)
model.fit(df_clust2)
KMeans(n_clusters=3)
df_clust2['cluster' ] = model.labels_
df_clust2
| number_order_item | average_payment_order | last_purchase_days | cluster | |
|---|---|---|---|---|
| 0 | 1 | 141.90 | 111 | 0 |
| 1 | 1 | 27.19 | 114 | 0 |
| 2 | 1 | 86.22 | 536 | 1 |
| 3 | 1 | 43.62 | 320 | 2 |
| 4 | 1 | 196.89 | 287 | 2 |
| ... | ... | ... | ... | ... |
| 80217 | 1 | 73.16 | 254 | 2 |
| 80218 | 1 | 84.58 | 261 | 2 |
| 80219 | 1 | 112.46 | 567 | 1 |
| 80220 | 1 | 133.69 | 118 | 0 |
| 80221 | 1 | 71.56 | 483 | 1 |
80222 rows × 4 columns
df_clust2.cluster.value_counts()
0 30682 2 30297 1 19243 Name: cluster, dtype: int64
import plotly.express as px

# One notched box plot per clustered feature of `df_clust2`, split by cluster.
# The `df = px.data.tips()` lines (plotly tutorial residue) are gone: they
# overwrote `df` with an unrelated sample dataset and were never used.
for column in ("last_purchase_days", "number_order_item", "average_payment_order"):
    fig = px.box(
        df_clust2,
        y=column,
        color="cluster",
        notched=True,  # notched box shape
        title="Box plot cluster",
        hover_data=["cluster"],  # show the cluster id on hover
    )
    fig.show()
df_outlier.describe()
| number_order_item | review_score | average_payment_order | late_delivered_days | last_purchase_days | |
|---|---|---|---|---|---|
| count | 80222.000000 | 80222.000000 | 80222.000000 | 80222.000000 | 80222.000000 |
| mean | 1.128780 | 4.298658 | 104.421378 | -12.116352 | 238.825933 |
| std | 0.527578 | 1.140118 | 66.871755 | 9.571503 | 153.107393 |
| min | 1.000000 | 1.000000 | 6.080000 | -147.000000 | 0.000000 |
| 25% | 1.000000 | 4.000000 | 54.620000 | -17.000000 | 114.000000 |
| 50% | 1.000000 | 5.000000 | 86.501667 | -13.000000 | 221.000000 |
| 75% | 1.000000 | 5.000000 | 137.670000 | -7.000000 | 349.000000 |
| max | 33.000000 | 5.000000 | 436.690000 | 188.000000 | 694.000000 |
# Silhouette plot for a 3-cluster KMeans on the three clustered features.
model = KMeans(3)
visualizer = SilhouetteVisualizer(model)
# Keep only the feature columns (this drops the `cluster` label column).
df_clust2 = df_clust2.loc[:, ['number_order_item', 'average_payment_order', 'last_purchase_days']]
visualizer.fit(df_clust2)  # fit the data to the visualizer
visualizer.show()  # `show()` replaces the deprecated `poof()`
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 80222 Samples in 3 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
# Silhouette plot for a 2-cluster KMeans on `df_clust2`.
model = KMeans(2)
visualizer = SilhouetteVisualizer(model)
visualizer.fit(df_clust2)  # fit the data to the visualizer
visualizer.show()  # `show()` replaces the deprecated `poof()`
<AxesSubplot:title={'center':'Silhouette Plot of KMeans Clustering for 80222 Samples in 2 Centers'}, xlabel='silhouette coefficient values', ylabel='cluster label'>
scaler2 = StandardScaler()
df_scalled = scaler2.fit_transform(df_outlier.select_dtypes(include = np.number))
df_scalled
array([[-0.24409852, 0.61515247, 0.30484942, 0.74349834],
[-0.24409852, -0.26195525, -1.01228746, 0.74349834],
[-0.24409852, -1.13906297, -0.33448614, 1.0569307 ],
...,
[-0.24409852, 0.61515247, -0.03319007, -1.86843797],
[-0.24409852, 0.61515247, 0.21057957, -0.19679873],
[-0.24409852, 0.61515247, -0.50281694, -0.40575364]])
#df['cluster'] = model.labels_
#df_outlier[df_outlier.total_payment < ]
model = KMeans(n_clusters=2)
model.fit(df_scalled)
KMeans(n_clusters=2)
data_scalled = pd.DataFrame(df_scalled, columns=df_outlier.select_dtypes(include = np.number).columns)
data_scalled
| order_id | review_score | total_payment | late_delivered | |
|---|---|---|---|---|
| 0 | -0.244099 | 0.615152 | 0.304849 | 0.743498 |
| 1 | -0.244099 | -0.261955 | -1.012287 | 0.743498 |
| 2 | -0.244099 | -1.139063 | -0.334486 | 1.056931 |
| 3 | -0.244099 | -0.261955 | -0.823633 | 0.012156 |
| 4 | -0.244099 | 0.615152 | 0.936262 | 0.430066 |
| ... | ... | ... | ... | ... |
| 80217 | -0.244099 | 0.615152 | -0.484445 | -1.137096 |
| 80218 | -0.244099 | -0.261955 | -0.353317 | 0.952453 |
| 80219 | -0.244099 | 0.615152 | -0.033190 | -1.868438 |
| 80220 | -0.244099 | 0.615152 | 0.210580 | -0.196799 |
| 80221 | -0.244099 | 0.615152 | -0.502817 | -0.405754 |
80222 rows × 4 columns
data_scalled['cluster' ] = model.labels_
data_scalled
| order_id | review_score | total_payment | late_delivered | cluster | |
|---|---|---|---|---|---|
| 0 | -0.244099 | 0.615152 | 0.304849 | 0.743498 | 1 |
| 1 | -0.244099 | -0.261955 | -1.012287 | 0.743498 | 1 |
| 2 | -0.244099 | -1.139063 | -0.334486 | 1.056931 | 0 |
| 3 | -0.244099 | -0.261955 | -0.823633 | 0.012156 | 1 |
| 4 | -0.244099 | 0.615152 | 0.936262 | 0.430066 | 1 |
| ... | ... | ... | ... | ... | ... |
| 80217 | -0.244099 | 0.615152 | -0.484445 | -1.137096 | 1 |
| 80218 | -0.244099 | -0.261955 | -0.353317 | 0.952453 | 1 |
| 80219 | -0.244099 | 0.615152 | -0.033190 | -1.868438 | 1 |
| 80220 | -0.244099 | 0.615152 | 0.210580 | -0.196799 | 1 |
| 80221 | -0.244099 | 0.615152 | -0.502817 | -0.405754 | 1 |
80222 rows × 5 columns
data_scalled.cluster.value_counts()
1 67327 0 12895 Name: cluster, dtype: int64
# NOTE(review): `d` is not assigned anywhere in the visible notebook, so this
# cell presumably relied on a variable left over in the kernel — confirm which
# DataFrame was meant, or remove the cell.
d.cluster.value_counts()
0 61982 1 18240 Name: cluster, dtype: int64
import plotly.express as px
# NOTE(review): this assignment replaces `df` with plotly's sample dataset and
# the result is not used by the plots below — looks like tutorial residue.
df = px.data.tips()
# NOTE(review): as built in the visible cells, `df_clust2` only holds
# number_order_item / average_payment_order / last_purchase_days (its `cluster`
# column is dropped before the silhouette plot). The columns requested below
# ("order_id", "review_score", "total_payment", "cluster") are therefore not
# created for it anywhere in view — presumably left over from an earlier
# version of the data; confirm which frame and columns were intended.
fig = px.box(df_clust2, y="order_id", color="cluster",
             notched=True,  # notched box shape
             title="Box plot cluster",
             hover_data=["cluster"]  # show the cluster id on hover
             )
fig.show()
fig = px.box(df_clust2, y="review_score", color="cluster",
             notched=True,  # notched box shape
             title="Box plot cluster",
             hover_data=["cluster"]  # show the cluster id on hover
             )
fig.show()
fig = px.box(df_clust2, y="total_payment", color="cluster",
             notched=True,  # notched box shape
             title="Box plot cluster",
             hover_data=["cluster"]  # show the cluster id on hover
             )
fig.show()
import plotly.express as px

# One notched box plot per standardised feature of `data_scalled`, split by
# cluster. The `df = px.data.tips()` line (plotly tutorial residue) is gone:
# it overwrote `df` with an unrelated sample dataset and was never used.
for column in ("order_id", "review_score", "total_payment"):
    fig = px.box(
        data_scalled,
        y=column,
        color="cluster",
        notched=True,  # notched box shape
        title="Box plot cluster",
        hover_data=["cluster"],  # show the cluster id on hover
    )
    fig.show()
data_scalled.total_payment.max(),data_scalled.total_payment.min()
(40.06662947607055, -1.2546792312474864)